Virus Count Analysis

tidied by sean :)

Author

Ramon Lorenzo

Published

November 7, 2023

Code
library(tidyverse)
# devtools::install_github("vqv/ggbiplot")
library(ggbiplot)
library(plotly)
library(ggsci)
library(ggpubr)
library(pscl)
library(glmmTMB)
library(performance)
library(emmeans)
library(fitdistrplus)

virus_counts <- read_csv("data/Virus count - R21.csv") |>
  janitor::clean_names(replace=janitor:::mu_to_u) |> 
  mutate(donor = factor(donor),
         image = factor(image),
         virions = as.numeric(as.character(virions)),
         penetrating_virions = as.numeric(as.character(penetrating_virions)),
         percent_of_penetrating_virions = as.numeric(as.character(percent_of_penetrating_virions)))

# making some tactical deletions-- no percent of penetrating greater than 100; no donor 1905; no NA virions

virus_counts <- virus_counts |> 
  filter(donor != "1905") |> 
  filter(percent_of_penetrating_virions < 101) |>
  filter(!is.na(virions))

distribution check

based on some distribution evaluation things, it looks like lognormal is gonna fit our data the best:

Code
plotdist(virus_counts$virions, histo = TRUE, demp = TRUE)

Code
descdist(virus_counts$virions, discrete=FALSE, boot=500)

summary statistics
------
min:  0   max:  142 
median:  7 
mean:  12.57399 
estimated sd:  16.89708 
estimated skewness:  2.823426 
estimated kurtosis:  14.00686 
Code
# i guess this is a mostly lognormal distribution??
fnb<-fitdist(virus_counts$virions, "nbinom")
fp<-fitdist(virus_counts$virions, "pois")
plot.legend <- c( "nbinom", "pois")
denscomp(list(fnb,fp), legendtext = plot.legend)

Code
qqcomp(list(fnb,fp), legendtext = plot.legend)

Code
# based on qqplot-- poisson sucks
cdfcomp(list(fnb,fp), legendtext = plot.legend)

Code
ppcomp(list(fnb,fp), legendtext = plot.legend)

Code
# idk what a pp plot is either but im just gonna say poisson has not been killing it here

gofstat(list(fnb,fp))
Chi-squared statistic:  14.0007 1.924514e+14 
Degree of freedom of the Chi-squared distribution:  14 15 
Chi-squared p-value:  0.4496586 0 
   the p-value may be wrong with some theoretical counts < 5  
Chi-squared table:
      obscounts theo 1-mle-nbinom theo 2-mle-pois
<= 0         82          88.61887    2.315321e-03
<= 1         53          57.23898    2.911282e-02
<= 2         57          45.63496    1.830322e-01
<= 3         37          38.68600    7.671484e-01
<= 4         35          33.77116    2.411529e+00
<= 5         38          29.99192    6.064509e+00
<= 6         32          26.93826    1.270918e+01
<= 8         47          46.60377    5.871124e+01
<= 10        43          38.99262    1.131657e+02
<= 12        32          33.07158    1.475552e+02
<= 14        31          28.31203    1.386151e+02
<= 18        32          45.54323    1.524715e+02
<= 22        35          34.43936    3.280324e+01
<= 27        32          31.94795    3.430776e+00
<= 36        28          37.12506    8.036004e-02
<= 48        28          26.23579    1.197628e-05
> 48         27          25.84847    3.787970e-12

Goodness-of-fit criteria
                               1-mle-nbinom 2-mle-pois
Akaike's Information Criterion     4731.874   13375.99
Bayesian Information Criterion     4740.886   13380.49

Initial Modeling

so let’s make a lognormal model!

Code
initial_model <- lme4::lmer(virions ~ condition*tissue+(1|donor), virus_counts)

plot(check_distribution(initial_model))

Code
zinbm0<-glmmTMB(virions ~ condition*tissue+(1|donor), 
                family="nbinom2",
                ziformula=~1,data = virus_counts)

zinbm1<-glmmTMB(virions ~ tissue*condition+(1|donor), 
                family="nbinom2",
                data = virus_counts)

zinbm2<-glmmTMB(virions ~ condition*tissue+(1|donor), 
                family="nbinom2",
                ziformula=~tissue,data = virus_counts)

anova(zinbm0,zinbm1,zinbm2)
Data: virus_counts
Models:
zinbm1: virions ~ tissue * condition + (1 | donor), zi=~0, disp=~1
zinbm0: virions ~ condition * tissue + (1 | donor), zi=~1, disp=~1
zinbm2: virions ~ condition * tissue + (1 | donor), zi=~tissue, disp=~1
       Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)   
zinbm1  6 4392.4 4419.5 -2190.2   4380.4                            
zinbm0  7 4386.6 4418.1 -2186.3   4372.6 7.8505      1   0.005081 **
zinbm2  8 4388.2 4424.2 -2186.1   4372.2 0.3858      1   0.534516   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
model_performance(zinbm0)
# Indices of model performance

AIC      |     AICc |      BIC | R2 (cond.) | R2 (marg.) |   ICC |   RMSE | Sigma | Score_log | Score_spherical
---------------------------------------------------------------------------------------------------------------
4386.570 | 4386.739 | 4418.110 |      0.386 |      0.011 | 0.379 | 12.199 | 1.709 |    -3.243 |           0.025
Code
check_model(zinbm0)

Code
compare_performance(zinbm0,zinbm1,zinbm2)
# Comparison of Model Performance Indices

Name   |   Model |  AIC (weights) | AICc (weights) |  BIC (weights) | R2 (cond.) | R2 (marg.) |   ICC |   RMSE | Sigma | Score_log | Score_spherical
----------------------------------------------------------------------------------------------------------------------------------------------------
zinbm0 | glmmTMB | 4386.6 (0.667) | 4386.7 (0.671) | 4418.1 (0.642) |      0.386 |      0.011 | 0.379 | 12.199 | 1.709 |    -3.243 |           0.025
zinbm1 | glmmTMB | 4392.4 (0.036) | 4392.5 (0.037) | 4419.5 (0.328) |      0.567 |      0.022 | 0.558 | 12.236 | 1.458 |    -3.216 |           0.029
zinbm2 | glmmTMB | 4388.2 (0.297) | 4388.4 (0.292) | 4424.2 (0.030) |      0.385 |      0.011 | 0.379 | 12.191 | 1.702 |    -3.242 |           0.025

Then we can compare EMMs of our best model, binom0 :

Code
t <- pairs(emmeans(zinbm0, ~ condition))
s <- pairs(emmeans(zinbm0, ~ tissue))
t.s <- pairs(emmeans(zinbm0, ~ tissue | condition))
s.t <- pairs(emmeans(zinbm0, ~ condition | tissue))
rbind(t.s, s.t,adjust="fdr")
 condition     tissue contrast                    estimate    SE  df z.ratio
 Circumcised   .      Glans - Shaft                 -0.308 0.104 Inf  -2.961
 Uncircumcised .      Glans - Shaft                  0.224 0.109 Inf   2.059
 .             Glans  Circumcised - Uncircumcised   -0.385 0.320 Inf  -1.205
 .             Shaft  Circumcised - Uncircumcised    0.146 0.320 Inf   0.456
 p.value
  0.0123
  0.0789
  0.3040
  0.6482

Results are given on the log (not the response) scale. 
P value adjustment: fdr method for 4 tests 
Code
check_model(zinbm0)

Code
ggboxplot(virus_counts, x = "condition", 
          y = "virions",
          combine = TRUE,
          color = "tissue", palette = "npg",
          ylab = "virions", 
          add = "jitter",                              # Add dotplot
          add.params = list(binwidth = 0.2))+scale_y_log10()

what’s this MEAN?

look at some means of data:

Code
mean_virus_counts <-virus_counts %>%
  group_by(condition, tissue, donor) %>%
  summarise(mean_virions = mean(virions)) 

mean_model <- lme4::lmer(log10(mean_virions) ~ condition*tissue+(1|donor), 
                    data = mean_virus_counts)

plot(check_distribution(mean_model))

Code
t <- pairs(emmeans(mean_model, ~ condition))
s <- pairs(emmeans(mean_model, ~ tissue))
t.s <- pairs(emmeans(mean_model, ~ tissue | condition))
s.t <- pairs(emmeans(mean_model, ~ condition | tissue))
rbind(t.s, s.t,adjust="fdr")
 condition     tissue contrast                    estimate    SE   df t.ratio
 Circumcised   .      Glans - Shaft                 -0.209 0.126 30.4  -1.656
 Uncircumcised .      Glans - Shaft                  0.134 0.138 28.2   0.973
 .             Glans  Circumcised - Uncircumcised   -0.216 0.171 51.2  -1.269
 .             Shaft  Circumcised - Uncircumcised    0.127 0.172 51.8   0.738
 p.value
  0.4204
  0.4515
  0.4204
  0.4638

Degrees-of-freedom method: kenward-roger 
Results are given on the log10 (not the response) scale. 
P value adjustment: fdr method for 4 tests 
Code
check_model(mean_model)

Code
qqnorm(resid(mean_model))
qqline(resid(mean_model))

Code
hist(resid(mean_model))

Code
plot(fitted(mean_model),resid(mean_model))
abline(h=0)

Code
mean_virus_counts %>%
  ggplot(aes(x=condition,y=mean_virions,color=tissue))+
  geom_boxplot(outlier.shape = NA)+
  geom_point(position = position_jitterdodge())+
  theme_pubr()+scale_color_npg()+
  facet_grid(.~tissue, scales = "free")

Penetrators

Code
Penetrators <- virus_counts |>
  dplyr::select(condition, tissue, group, percent_of_penetrating_virions)

lm_penetrators <- gamlss::gamlss((percent_of_penetrating_virions/100) ~ condition*tissue,
                                 # beta inflated something something
                                 family = gamlss.dist::BEINF(), 
                                 data = Penetrators)
GAMLSS-RS iteration 1: Global Deviance = 968.005 
GAMLSS-RS iteration 2: Global Deviance = 965.7493 
GAMLSS-RS iteration 3: Global Deviance = 965.7472 
GAMLSS-RS iteration 4: Global Deviance = 965.7472 
Code
plot(check_distribution(lm_penetrators))

Code
# Check on this:
# binned_residuals(lm_penetrators, residuals = "response")
# check_model(lm_penetrators)

t <- pairs(emmeans(lm_penetrators, ~ condition))
s <- pairs(emmeans(lm_penetrators, ~ tissue))
t.s <- pairs(emmeans(lm_penetrators, ~ tissue | condition))
s.t <- pairs(emmeans(lm_penetrators, ~ condition | tissue))
rbind(t.s,s.t,adjust="FDR")
 condition     tissue contrast                    estimate    SE  df z.ratio
 Circumcised   .      Glans - Shaft                  0.179 0.105 Inf   1.710
 Uncircumcised .      Glans - Shaft                  0.124 0.113 Inf   1.098
 .             Glans  Circumcised - Uncircumcised   -0.226 0.109 Inf  -2.073
 .             Shaft  Circumcised - Uncircumcised   -0.281 0.109 Inf  -2.580
 p.value
  0.1164
  0.2722
  0.0764
  0.0395

Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 4 tests 
Code
qqnorm(resid(lm_penetrators))
qqline(resid(lm_penetrators))

Code
hist(resid(lm_penetrators))

Code
plot(fitted(lm_penetrators),resid(lm_penetrators))
abline(h=0)

Code
# maybe see higher of pct of penetrating virions in uncircumcised?
virus_counts%>%
  ggplot(aes(x=condition,y=percent_of_penetrating_virions,color=condition))+
  geom_boxplot(outlier.shape = NA)+
  geom_point(position = position_jitterdodge())+
  theme_pubr()+scale_color_npg()+facet_grid(.~tissue,scales = "free")

Code
mean_penetrators <-virus_counts %>%
  group_by(condition, tissue, donor) %>%
  summarise(mean_percent_of_penetrating_virions = mean(percent_of_penetrating_virions)) 

Depth

analysis of how far the virions penetrated

Code
Depth<-virus_counts[,c(1:4,9:59)]

Depth<-Depth%>%
  dplyr::rename("x9" = "penetration_depths_um") %>%
  pivot_longer(x9:x59,
               names_to = "particle",
               values_to = "penetration_depth") |>
  dplyr::select(!particle) |>
  drop_na(penetration_depth) |>
  mutate(penetration_depth = as.numeric(as.character(penetration_depth)))


mean_depth <- Depth %>%
  group_by(condition, tissue, donor) %>%
  summarise(mean_depth = mean(penetration_depth))

Depth%>%
  ggplot(aes(x=condition,y=penetration_depth,color=tissue))+
  geom_boxplot(outlier.shape = NA)+
  geom_point(position = position_jitterdodge())+
  theme_pubr()+scale_color_npg()+
  facet_grid(.~tissue,scales = "free")+scale_y_log10()

Code
mean_depth%>%
  ggplot(aes(x=condition,y=mean_depth,color=tissue))+
  geom_boxplot(outlier.shape = NA)+
  geom_point(position = position_jitterdodge())+
  theme_pubr()+scale_color_npg()+
  facet_grid(.~tissue,scales = "free")

Code
fl<-fitdist(mean_depth$mean_depth, "lnorm")
fn<-fitdist(mean_depth$mean_depth, "norm")
fg<-fitdist(mean_depth$mean_depth, "gamma")
plot.legend <- c("normal", "gamma", "lognorm")
denscomp(list(fn,fg,fl), legendtext = plot.legend)

Code
qqcomp(list(fn,fg,fl), legendtext = plot.legend)

Code
cdfcomp(list(fn,fg,fl), legendtext = plot.legend)

Code
ppcomp(list(fn,fg,fl), legendtext = plot.legend)

Code
gofstat(list(fn,fg,fl))
Goodness-of-fit statistics
                             1-mle-norm 2-mle-gamma 3-mle-lnorm
Kolmogorov-Smirnov statistic  0.1033860  0.06941775  0.05635467
Cramer-von Mises statistic    0.1500214  0.03586457  0.02445954
Anderson-Darling statistic    1.0541724  0.29134187  0.18752228

Goodness-of-fit criteria
                               1-mle-norm 2-mle-gamma 3-mle-lnorm
Akaike's Information Criterion   320.7036    310.6479    309.0504
Bayesian Information Criterion   324.9898    314.9342    313.3367
Code
lm_mean_depth <- lme4::lmer(log10(mean_depth) ~ condition*tissue +(1|donor), data = mean_depth)
plot(check_distribution(lm_mean_depth))

Code
binned_residuals(lm_mean_depth)
Warning: About 88% of the residuals are inside the error bounds (~95% or higher would be good).
Code
check_model(lm_mean_depth)

Code
t <- pairs(emmeans(lm_mean_depth, ~ condition))
s <- pairs(emmeans(lm_mean_depth, ~ tissue))
t.s <- pairs(emmeans(lm_mean_depth, ~ tissue | condition))
s.t <- pairs(emmeans(lm_mean_depth, ~ condition | tissue))
rbind(t.s,s.t,adjust="FDR")
 condition     tissue contrast                    estimate     SE   df t.ratio
 Circumcised   .      Glans - Shaft                 0.0664 0.0481 31.2   1.380
 Uncircumcised .      Glans - Shaft                 0.0292 0.0533 28.6   0.548
 .             Glans  Circumcised - Uncircumcised   0.0494 0.0538 58.2   0.919
 .             Shaft  Circumcised - Uncircumcised   0.0122 0.0545 58.3   0.225
 p.value
  0.7101
  0.7842
  0.7237
  0.8230

Degrees-of-freedom method: kenward-roger 
Results are given on the log10 (not the response) scale. 
P value adjustment: fdr method for 4 tests 
Code
qqnorm(resid(lm_mean_depth))
qqline(resid(lm_mean_depth))

Code
hist(resid(lm_mean_depth))

Code
plot(fitted(lm_mean_depth),resid(lm_mean_depth))
abline(h=0)